In [1]:
import numpy as np
import pandas as pd
import cPickle as pickle
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#Analyze Stratified Results
Laying groundwork to share the methods we used for model selection and parameter tuning.  All accuracy scores will be from cross-validation data.  

##Loading data
First list all of the parameter variations we were considering.

In [None]:
import os  
logs = []
mydir = "../results"
for fn in os.listdir(mydir):
    if 'pkl' in fn:
        print fn
        log = pickle.load(open( mydir + "/" + fn, "rb" ))
        logs = logs + log

In [45]:
def get_results_df(results_path):
    log = pickle.load(open( results_path, "rb" ))
    raw_df = pd.DataFrame.from_dict(log)
    df = raw_df[['strat_column','strat_value','model','best_score','train_accuracy','test_accuracy','num_cases','num_features','num_opinion_shards']]
    return df

def print_weighted_accuracy(df):
    models = df['model'].unique()
    for model in models:
        mdf = df.loc[df['model']==model,:]
        total_cases = sum(mdf['num_cases'])
        weighted_accuracy = sum(mdf['test_accuracy']*mdf['num_cases']/total_cases)
        print "model: %s, weighted accuracy: %s%%" %(model,round(weighted_accuracy*100,1))

In [33]:
results_df = get_results_df("../results/model_results.pkl.20150510-013946.20150510-013946.min_required_count.50.all_features.accuracy.stratified")
results_df

Unnamed: 0,strat_column,strat_value,model,best_score,train_accuracy,test_accuracy,num_cases,num_features,num_opinion_shards
0,geniss,0,baseline,,1.0,1.0,45,20401,500
1,geniss,0,naive_bayes,,1.0,1.0,45,20401,500
2,geniss,0,bernoulli_bayes,,1.0,1.0,45,20401,500
3,geniss,0,logistic,,1.0,1.0,45,20401,500
4,geniss,0,svm,,1.0,1.0,45,20401,500
5,geniss,1,baseline,,0.749422,0.748268,1730,20401,500
6,geniss,1,naive_bayes,0.749422,0.749422,0.748268,1730,20401,500
7,geniss,1,bernoulli_bayes,0.749422,0.750193,0.748268,1730,20401,500
8,geniss,1,logistic,0.749422,0.749422,0.748268,1730,20401,500
9,geniss,1,svm,0.749422,0.749422,0.748268,1730,20401,500


In [47]:
print_weighted_accuracy(results_df)

model: baseline, weighted accuracy: 48.5%
model: naive_bayes, weighted accuracy: 51.0%
model: bernoulli_bayes, weighted accuracy: 53.5%
model: logistic, weighted accuracy: 53.2%
model: svm, weighted accuracy: 53.0%


In [52]:
CONTEXT='notebook'
font_size = {
    'paper':8,
    'poster':16,
    'notebook':10,
    'talk':13
}
def chart_weighted_accuracy(df,metric):
    baseline_scores = {'best_score':'train_accuracy','test_accuracy':'test_accuracy'}
    models = df['model'].unique()
    scores = []
    for model in models:
        mdf = df.loc[df['model']==model,:]
        total_cases = sum(mdf['num_cases'])
        
        if model=='baseline':
            weighted_accuracy = sum(mdf[baseline_scores[metric]]*mdf['num_cases']/total_cases)
        else:
            weighted_accuracy = sum(mdf[metric]*mdf['num_cases']/total_cases)
        scores.append(weighted_accuracy)
            
    return models,scores
        
def weighted_accuracy_bars(df,metric,context):
    '''
    df: data frame
    context: paper,talk, notebook, poster
    '''
    
    sns.set_context(context)
    model_list,score_list = chart_weighted_accuracy(df,metric)
    
    #size and position of bars
    bar_pos = np.arange(len(model_list))
    bar_size = score_list
    bar_labels = model_list
    
    #plot
    plt.barh(bar_pos,bar_size, align='center', alpha=0.4)
    plt.yticks(bar_pos, bar_labels)
    plt.xticks([],[]) #no x-axis

    #Add data labels
    for x,y in zip(bar_size,bar_pos):
        plt.text(x+0.01, y, '%.2f' % x, ha='left', va='center',fontsize=font_size[context])
        
    pretty_metric = {'test_accuracy':'Test','best_score':'CV'}
    plt.title('Optimized %s Accuracy of Each Model' % pretty_metric[metric])
    plt.show()
    
weighted_accuracy_bars(results_df,'best_score',CONTEXT)

ValueError: cannot convert float NaN to integer

<matplotlib.figure.Figure at 0x104426c50>

#Same results, but with Mixed and Unknown Valences removed

In [32]:
results_df = get_results_df("../results/model_results.pkl.20150510-022044.20150510-022044.min_required_count.50.all_features.accuracy")
results_df

Unnamed: 0,strat_column,strat_value,model,best_score,train_accuracy,test_accuracy,num_cases,num_features,num_opinion_shards
0,geniss,1,baseline,,0.811817,0.822746,3904,42327,1340
1,geniss,1,naive_bayes,0.811817,0.811817,0.822746,3904,42327,1340
2,geniss,1,bernoulli_bayes,0.811817,0.812842,0.822746,3904,42327,1340
3,geniss,1,logistic,0.832309,1.0,0.818648,3904,42327,1340
4,geniss,1,svm,0.834699,1.0,0.82582,3904,42327,1340
5,geniss,2,baseline,,0.626984,0.60339,1177,42327,1340
6,geniss,2,naive_bayes,0.626984,0.630385,0.60339,1177,42327,1340
7,geniss,2,bernoulli_bayes,0.651927,0.884354,0.633898,1177,42327,1340
8,geniss,2,logistic,0.662132,0.997732,0.667797,1177,42327,1340
9,geniss,2,svm,0.663265,0.997732,0.667797,1177,42327,1340


In [46]:
print_weighted_accuracy(results_df)

model: baseline, weighted accuracy: 48.5%
model: naive_bayes, weighted accuracy: 51.0%
model: bernoulli_bayes, weighted accuracy: 53.5%
model: logistic, weighted accuracy: 53.2%
model: svm, weighted accuracy: 53.0%


In [41]:
STRAT_COLUMN=None
print 'stratify_by_'+STRAT_COLUMN if STRAT_COLUMN else ''




In [None]:
def print_weighted_accuracy(df):
    models = df['model'].unique()
    for model in models:
        mdf = df.loc[df['model']==model,:]
        total_cases = sum(mdf['num_cases'])
        weighted_accuracy = sum(mdf['test_accuracy']*mdf['num_cases']/total_cases)
        print "model: %s, weighted accuracy: %s%%" %(model,round(weighted_accuracy*100,1))
